library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(modelr)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ purrr 1.0.1 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(mlbench)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(smotefamily)
library(usmap)
df_rail <- read_csv("Rail_Equipment_Accident_Incident_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 216100 Columns: 160
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (91): Reporting Railroad Code, Reporting Railroad Name, Accident Number,...
## dbl (67): Report Year, Hazmat Cars, Hazmat Cars Damaged, Hazmat Released Car...
## lgl (2): Adjunct Code 3, Adjunct Code Name 3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_rail)
## # A tibble: 6 × 160
## Reporting Ra…¹ Repor…² Repor…³ Accid…⁴ PDF L…⁵ Accid…⁶ Accid…⁷ Other…⁸ Other…⁹
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 NS Norfol… 2016 120068 https:… 16 04 <NA> <NA>
## 2 NS Norfol… 2016 120068 https:… 16 04 <NA> <NA>
## 3 CR Conrail 1981 0420001 https:… 81 04 <NA> <NA>
## 4 NS Norfol… 2016 120161 https:… 16 04 <NA> <NA>
## 5 NS Norfol… 2016 120161 https:… 16 04 <NA> <NA>
## 6 CSX CSX Tr… 2018 000174… https:… 18 01 <NA> <NA>
## # … with 151 more variables: `Other Accident Number` <chr>,
## # `Other Accident Year` <chr>, `Other Accident Month` <chr>,
## # `Maintenance Railroad Code` <chr>, `Maintenance Railroad Name` <chr>,
## # `Maintenance Accident Number` <chr>, `Maintenance Accident Year` <chr>,
## # `Maintenance Accident Month` <chr>, `Grade Crossing ID` <chr>, Day <chr>,
## # Date <chr>, Time <chr>, `Accident Type Code` <chr>, `Accident Type` <chr>,
## # `Hazmat Cars` <dbl>, `Hazmat Cars Damaged` <dbl>, …
#storing how many accidents occurred per year in a data frame
df <- df_rail %>%
group_by(`Report Year`) %>%
summarise(counts = n())
#dropping NA values from data frame
df <- df %>% drop_na()
#df
ggplot(df, aes(x = `Report Year`, y = counts)) +
geom_bar(stat = "identity") +
labs(x="Report Year",
y="Accident Count",
title="Total Number of Accidents per Year")
Observation: From the above visualization, we can conclude that railroad
accidents have overall decreased in the past 48 years. Additionally, the
accidents reported over the past 10 years have all been around 2500.
#storing total evacuated for each category of visibility in a data frame
df1 <- df_rail %>%
select(`Persons Evacuated`, `Visibility`) %>%
group_by(`Visibility`) %>%
summarise(total_persons_evacuated = sum(`Persons Evacuated`))
#dropping NA values from data frame
df1 <- df1 %>% drop_na()
#df1
ggplot(df1, aes(Visibility, total_persons_evacuated, fill=Visibility))+
geom_bar(stat = "identity") +
geom_text(aes(label = total_persons_evacuated), vjust = -0.3) +
labs(x="Visibility",
y="Total Persons Evacuated",
title="Total Persons Evacuated by Visibility")
Observation: From the above visualization, we can conclude that more
number of people were evacuated during the Day and the least number of
people are evacuated during Dusk.
#storing total accidents for each category of visibility in a data frame
df2 <- df_rail %>%
select(`Persons Evacuated`, `Visibility`) %>%
group_by(`Visibility`) %>%
summarise(count_of_accidents = n())
#dropping NA values from data frame
df2 <- df2 %>% drop_na()
#df2
ggplot(df2, aes(Visibility, count_of_accidents, fill=Visibility))+
geom_bar(stat = "identity") +
geom_text(aes(label = count_of_accidents), vjust = -0.3) +
labs(x="Visibility",
y="Total Number of Accidents",
title="Total Number of Accidents by Visibility")
Observation: From the above visualization, we can conclude that the most
accidents occurred during the Day (with Dark being a close second for
the visibility with the most accidents). The least number of accidents
occurred during Dawn and Dusk.
#storing the proportion of people evacuated for each category of visibility in a data frame
df3 <- df_rail %>%
select(`Persons Evacuated`, `Visibility`) %>%
group_by(`Visibility`) %>%
summarise(evacuation_accident_ratio = sum(`Persons Evacuated`)/n())
#dropping NA values from data frame
df3 <- df3 %>% drop_na()
#df3
ggplot(df3, aes(Visibility, evacuation_accident_ratio, fill=Visibility))+
geom_bar(stat = "identity") +
labs(x="Visibility",
y="Total Persons Evacuated per Accident",
title="Total Persons Evacuated per Accident by Visibility")
Observation: From the above visualization, we can conclude that on
average more people are evacuated during the Day and the least number of
people are evacuated during Dusk.
#storing the total accidents for each state in a data frame
df4 <- df_rail %>%
group_by(`State Name`) %>%
summarise(counts = n())
#dropping NA values from data frame
df4 <- df4 %>% drop_na()
#df4
ggplot(df4, aes(x = `State Name`, y = counts)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 6)) +
labs(x="State Name",
y="Accident Count",
title="Total Number of Accidents per State")
df4Copy <- df4
statepop$full <- toupper(statepop$full)
colnames(df4Copy)[1] <- "full"
total <- merge(statepop,df4Copy,by="full", all.x = TRUE)
total[is.na(total)] <- 0
plot_usmap(data = total, values = "counts", include = c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY", "HI"), color = "#ef6c00") +
scale_fill_continuous(low = "white", high = "#ef6c00", name = "Total # of Accidents", label = scales::comma) +theme(legend.position = "right") + labs(title="Total Number of Accidents per State")
Observation: From the above two visualizations, we can conclude that the
most number of accidents are caused in Illinois and Texas followed by
California. The least number of accidents occurred in Rhode Island and
New Hampshire. The number of accidents may be a reflection on how many
trains in general go to that state. Often states with high volumes of
tourists or with large import/exports have a greater chance of
accidents.
df5 <- df_rail
ggplot(df5, aes(y=Temperature)) +
geom_boxplot() +
lims(x=c(-1, 1)) +
labs(title="Distribution of Temperature during Accident",
y="Temperature") +
theme_minimal()
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
Observation: From the above visualization, we can conclude that the
temperatures range from -75 to 815 degrees Fahrenheit. There are a few
outliers where the temperature is extremely high so we can filter out
values greater than 200 degrees Fahrenheit.
#filtering out all temperatures above 200 F and storing data in a data frame
df5_filtered <- filter(df5, Temperature < 200)
ggplot(df5_filtered, aes(y=Temperature)) +
geom_boxplot() +
lims(x=c(-1, 1)) +
labs(title="Distribution of Temperature during Accident",
y="Temperature") +
theme_minimal()
Observation: From the above visualization, we can conclude that the
average temperature during railroad accidents is around 60 degrees
Fahrenheit.
#storing the total accidents by equipment type in a data frame
df6 <- df_rail %>%
group_by(`Equipment Type`) %>%
summarise(count_of_accidents = n())
#dropping NA values from data frame
df6 <- df6 %>% drop_na()
#df6
ggplot(df6, aes(`Equipment Type`, count_of_accidents, fill=`Equipment Type`))+
geom_bar(stat = "identity") +
labs(x="Equipment Type",
y="Total Number of Accidents",
title="Total Number of Accidents by Equipment Type") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 6), legend.key.size=unit(7,"point"))+guides(colour=guide_legend(nrow=3))
Observation: From the above visualization, we can conclude that most of
the accidents were caused when the equipment type was Freight Train
followed by Yard/switching. The least accidents were caused when the
equipment type was DMU.
#The `Equipment Attended` column contains "yes", "no", 0, and 7 values. Since 0 and 7 appear to be incorrect values, they were filtered out from the data set and only Yer or No values were kept.
df7 <- filter(df_rail, `Equipment Attended` == 'No' | `Equipment Attended` == 'Yes')
ggplot(df7, aes(`Equipment Attended`, fill=`Equipment Attended`))+
geom_bar(stat="count",
position="identity") +
labs(x="Equipment Attended",
y="Total Number of Accidents",
title="Total Number of Accidents Based on Equipment Attended")
Observation: From the above visualization, we can conclude that for most
of the accidents, the equipment was attended when the accident took
place.
#filtering out temperture outliers
df8 <- filter(df_rail, Temperature < 200)
ggplot(data = df8,
mapping = aes(x=`Train Speed`, y=`Temperature`)) +
geom_point(alpha=0.1) +
labs(x="Train Speed",
y="Temperature",
title="Temperature vs Train Speed")
## Warning: Removed 2 rows containing missing values (`geom_point()`).
Observation: From the above visualization, we can conclude that
temperatures usually ranged from -50 to 125 degrees Fahrenheit when the
train speed is between 0 and 75. The range to temperatures got smaller
as train speeds increased over 75.
ggplot(df8, aes(x=Temperature)) +
geom_histogram(binwidth=5) +
labs(x="Temperature",
title="Temperature Distribution for Railroad Accidents") +
theme_minimal()
Observation: From the above visualization, we can conclude that
temperature produces a normal distribution and most of the accidents are
caused when the temperature is around 60 degrees Fahrenheit.
ggplot(df8, aes(x=`Train Speed`)) +
geom_histogram(binwidth=5) +
labs(x="Train Speed",
title="Train Speed Distribution for Railroad Accidents") +
theme_minimal()
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).
Observation: From the above visualization, we can conclude that most of
the accidents are caused when the Train speed is between 0 to 15 or when
it is low.
viz_data <- df_rail %>% select(`Weather Condition`, `Weather Condition Code`, `Visibility`, `Visibility Code`)
viz_data <- na.omit(viz_data)
#Remove empty string data
rows_with_empty_strings_visibility <- grepl("^$", viz_data$Visibility)
rows_with_empty_strings_weather <- grepl("^$", viz_data$`Weather Condition`)
viz_data <- viz_data[!rows_with_empty_strings_visibility,]
viz_data <- viz_data[!rows_with_empty_strings_weather,]
viz_data <- viz_data %>%
group_by(`Weather Condition`, Visibility) %>%
summarize(Count = n())
## `summarise()` has grouped output by 'Weather Condition'. You can override using
## the `.groups` argument.
viz_data <- na.omit(viz_data)
ggplot(viz_data, aes(x = `Weather Condition`, y = Count, fill = Visibility)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Weather Condition by Visibility", x = "Weather Condition", y = "Count", fill = "Visibility") +
theme_minimal()
Observation: From the above visualization, we can conclude that the most
accidents were caused in the day with a clear weather. Across all
weather conditions we can notice that most accidents occurred either in
the dark or in the day. The least accidents occurred during sleet.
viz2_data <- df_rail %>% select(`Accident Type`)
ggplot(viz2_data, aes(x = `Accident Type`)) +
geom_bar() +
labs(title = "Top 5 Most Frequent Accident Types",
x = "Accident Type",
y = "Frequency") +
coord_flip() +
theme_classic() +
scale_x_discrete(limits = names(sort(table(viz2_data$`Accident Type`), decreasing = TRUE)[1:5]))
## Warning: Removed 19017 rows containing non-finite values (`stat_count()`).
Observation: From the above visualization, we can conclude that
Derailment was the most frequent type of accident that occurred.
Rear-end collision was the least frequent type of accident.
viz3_data <- df_rail %>% select(`Track Type`, `Accident Type`)
#Pre-processing to remove empty values
rows_with_empty_strings_track <- grepl("^$", viz3_data$`Track Type`)
viz3_data <- viz3_data[!rows_with_empty_strings_track,]
#Visualization
ggplot(viz3_data, aes(x = `Accident Type`, fill = `Track Type`)) +
geom_bar(position = "dodge", width = 0.5) +
labs(title = "Frequency of each accident type by track type",
x = "Accident Type",
y = "Frequency",
fill = "Track Type") +
theme_minimal() +
scale_x_discrete(limits = names(sort(table(viz3_data$`Accident Type`), decreasing = TRUE)[1:5]))
## Warning: Removed 19017 rows containing non-finite values (`stat_count()`).
Observation: From the above visualization, we can conclude that
Derailment was the most frequent type of accident that occurred. Across
all accident types, the accidents often involved either main or yard
track types.
drug_impact <- df_rail %>% select(`Positive Alcohol Tests`, `Positive Drug Tests`)
drug_impact <- na.omit(drug_impact)
drug_impact$`Positive Alcohol Tests` <- as.logical(drug_impact$`Positive Alcohol Tests`)
drug_impact$`Positive Drug Tests` <- as.logical(drug_impact$`Positive Drug Tests`)
counts <- table(drug_impact)
viz4_data <- as.data.frame(counts)
viz4_data$labels <- c("Both Negative", "Only Alcohol", "Only Drug", "Both Positive")
viz4_data <- viz4_data[-1,]
# Create stacked bar plot
ggplot(viz4_data, aes(x = labels, y = Freq)) +
geom_bar(stat = "identity", fill = "maroon") +
labs(y = "Frequency", x="Test", title="Frequency of Each Type of Positive Test")
Observation: From the above visualization, we can conclude that majority
of the tests done showed the presence of drugs. It was pretty rare for
both alcohol and drug tests to be positive simultaneously.
viz5_data <- df_rail %>% select(`Hours Engineers On Duty`, `Minutes Engineers On Duty`, `Hours Conductors On Duty`, `Minutes Conductors On Duty`)
viz5_data$totalMinEngineers = (60 * viz5_data$`Hours Engineers On Duty`) + viz5_data$`Minutes Engineers On Duty`
viz5_data$totalMinConductors = (60 * viz5_data$`Hours Conductors On Duty`) + viz5_data$`Minutes Conductors On Duty`
viz5_data <- na.omit(viz5_data)
engineers_iqr <- IQR(viz5_data$totalMinEngineers)
conductors_iqr <- IQR(viz5_data$totalMinConductors)
engineers_upper <- quantile(viz5_data$totalMinEngineers, 0.75) + 1.5 * engineers_iqr
engineers_lower <- quantile(viz5_data$totalMinEngineers, 0.25) - 1.5 * engineers_iqr
conductors_upper <- quantile(viz5_data$totalMinConductors, 0.75) + 1.5 * conductors_iqr
conductors_lower <- quantile(viz5_data$totalMinConductors, 0.25) - 1.5 * conductors_iqr
# filter the data to remove outliers
viz5_data <- viz5_data %>%
filter(
totalMinEngineers >= engineers_lower,
totalMinEngineers <= engineers_upper,
totalMinConductors >= conductors_lower,
totalMinConductors <= conductors_upper
)
# create the plot with the filtered data
ggplot(viz5_data, aes(x = totalMinEngineers, y = totalMinConductors)) +
geom_point() +
geom_smooth() +
labs(x = "Total Minimum Engineers", y = "Total Minimum Conductors") +
theme_classic()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Observation: From the above visualization, we can conclude that as the
number of minimum engineers increased, the number of minimum conductors
also increases. There is a strong, positive, linear relationship between
the two factors.
viz6_data <- df_rail %>% select(`Train Speed`, `Positive Alcohol Tests`, `Positive Drug Tests`)
viz6_data$isPositiveAlcohol <-as.logical(viz6_data$`Positive Alcohol Tests`)
viz6_data$isPositiveDrug <-as.logical(viz6_data$`Positive Drug Tests`)
viz6_data <- na.omit(viz6_data)
ggplot(viz6_data, aes(x=isPositiveAlcohol, y=`Train Speed`, fill=isPositiveAlcohol)) +
geom_boxplot() +
scale_fill_brewer(palette="Set1", direction=-1) +
labs(y="Train Speed",
title="Higher train speed associated with alcohol") +
theme_minimal() +
stat_summary(fun.y = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
width = .75, linetype = "dashed",
position = position_dodge2(width = 0.75))
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..y..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(y)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Observation: From the above visualization, we can conclude that the
train speed range increases with the presence of a positive alcohol
test. The median (solid line) and mean (dashed line) train speed is also
higher with the presence of a positive alcohol test.
ggplot(viz6_data, aes(x=isPositiveDrug, y=`Train Speed`, fill=isPositiveDrug)) +
geom_boxplot() +
scale_fill_brewer(palette="Set1", direction=-1) +
labs(y="Train Speed",
title="Higher train speed associated with drug") +
theme_minimal() +
stat_summary(fun = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
width = .75, linetype = "dashed",
position = position_dodge2(width = 0.75))
Observation: From the above visualization, we can conclude that the
train speed range increases with the presence of a positive drug test.
The median (solid line) train speed, however, is around 5 regardless of
the outcome of the drug test. The mean (dashed line) train speed also
the same regardless of the outcome of the drug test.
##Regression Testing
set.seed(2)
df_part <- resample_partition(df_rail,p=c(train=0.5, test=0.5))
head(df_part)
## $train
## <resample [108,049 x 160]> 1, 2, 3, 4, 5, 6, 7, 8, 13, 16, ...
##
## $test
## <resample [108,051 x 160]> 9, 10, 11, 12, 14, 15, 18, 19, 20, 23, ...
# Filtering out the required columns and omitting NA values:
df_rail_filtered <- df_rail %>%
select(`Temperature`,`Weather Condition Code`, `Weather Condition`,`Visibility Code`,
`Visibility`, `Train Speed`, `Positive Alcohol Tests`,
`Positive Drug Tests`,`Engineers On Duty`,
`Minutes Engineers On Duty`, `First Car Position`, `Accident Type`,
`Total Persons Injured`) %>%
na.omit()
Total Persons Injured and candidate predictorsggplot(df_rail_filtered, aes(x=(`Weather Condition`), y=(`Total Persons Injured`))) +
geom_point() +
geom_smooth() +
geom_smooth(method="lm", color="red") +
labs(x="Weather Condition", y="Total Persons Injured", title="Total Persons Injured vs Weather Condition") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_rail_filtered, aes(x=(`Train Speed`), y=(`Total Persons Injured`))) +
geom_point() +
geom_smooth() +
geom_smooth(method="lm", color="red") +
labs(x="Train Speed", y="Total Persons Injured", title="Total Persons Injured vs Train Speed Distribution") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_rail_filtered, aes(x=log(`Train Speed`), y=log(`Total Persons Injured`))) +
geom_point() +
geom_smooth() +
geom_smooth(method="lm", color="red") +
labs(x="Train Speed", y="Total Persons Injured" , title="Total Persons Injured vs Train Speed Distribution") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 56363 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 56363 rows containing non-finite values (`stat_smooth()`).
ggplot(df_rail_filtered, aes(x=(`Temperature`), y=(`Total Persons Injured`))) +
geom_point() +
geom_smooth() +
geom_smooth(method="lm", color="red") +
labs(x="Temperature", y="Total Persons Injured", title="Total Persons Injured vs Temperature Distribution") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_rail_filtered, aes(x=log(`Temperature`), y=log(`Total Persons Injured`))) +
geom_point() +
geom_smooth() +
geom_smooth(method="lm", color="red") +
labs(x="Temperature", y="Total Persons Injured", title="Total Persons Injured vs Temperature Distribution") +
theme_minimal()
## Warning in log(Temperature): NaNs produced
## Warning in log(Temperature): NaNs produced
## Warning in log(Temperature): NaNs produced
## Warning in log(Temperature): NaNs produced
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 55793 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 55793 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 445 rows containing missing values (`geom_point()`).
ggplot(df_rail_filtered, aes(x=(`Positive Drug Tests`), y=(`Total Persons Injured`))) +
geom_point() +
geom_smooth() +
geom_smooth(method="lm", color="red") +
labs(x="Positive Drug Tests", y="Total Persons Injured", title="Total Persons Injured vs Positive Drug Tests Distribution") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`
## Caused by error in `smooth.construct.cr.smooth.spec()`:
## ! x has insufficient unique values to support 10 knots: reduce k.
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_rail_filtered, aes(x=log(`First Car Position`), y=log(`Total Persons Injured`))) +
geom_point() +
geom_smooth() +
geom_smooth(method="lm", color="red") +
labs(x="First Car Position", y="Total Persons Injured", title="Total Persons Injured vs First Car Position Distribution") +
theme_minimal()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 55754 rows containing non-finite values (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 55754 rows containing non-finite values (`stat_smooth()`).
# Fitting the linear regression models
fit_1 <- lm((`Total Persons Injured`) ~ (`Weather Condition Code`), data=df_part$train)
fit_2 <- lm((`Total Persons Injured`) ~ (`Train Speed`), data=df_part$train)
fit_3 <- lm((`Total Persons Injured`) ~ (`Temperature`), data=df_part$train)
fit_4 <- lm((`Total Persons Injured`) ~ (`Positive Drug Tests`), data=df_part$train)
fit_5 <- lm((`Total Persons Injured`) ~ (`Minutes Engineers On Duty`), data=df_part$train)
fit_6 <- lm((`Total Persons Injured`) ~ (`First Car Position`), data=df_part$train)
#rsme values were high
rmse(fit_1, df_part$test)
## [1] 4.163183
rmse(fit_2, df_part$test)
## [1] 4.153368
rmse(fit_3, df_part$test)
## [1] 4.159247
rmse(fit_4, df_part$test)
## [1] 5.052945
rmse(fit_5, df_part$test)
## [1] 4.433511
rmse(fit_6, df_part$test)
## [1] 4.256498
Observation: As observed from the above graphs, there is not a strong
linear relationship between the Total Persons Injured and
candidate predictors so we will try other models.
# Transforming the `Total Persons Injured` to a binary isInjured column:
df_rail_filtered$isInjured <-ifelse(df_rail_filtered$`Total Persons Injured` > 0, "Injured", "No Injury")
head(df_rail_filtered)
## # A tibble: 6 × 14
## Temperature Weather …¹ Weath…² Visib…³ Visib…⁴ Train…⁵ Posit…⁶ Posit…⁷ Engin…⁸
## <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 60 1 Clear 1 Dawn 6 0 0 1
## 2 60 1 Clear 1 Dawn 4 0 0 1
## 3 30 1 Clear 2 Day 3 0 0 1
## 4 26 1 Clear 2 Day 5 0 0 1
## 5 72 3 Rain 4 Dark 10 0 0 1
## 6 42 1 Clear 3 Dusk 8 0 0 1
## # … with 5 more variables: `Minutes Engineers On Duty` <dbl>,
## # `First Car Position` <dbl>, `Accident Type` <chr>,
## # `Total Persons Injured` <dbl>, isInjured <chr>, and abbreviated variable
## # names ¹`Weather Condition Code`, ²`Weather Condition`, ³`Visibility Code`,
## # ⁴Visibility, ⁵`Train Speed`, ⁶`Positive Alcohol Tests`,
## # ⁷`Positive Drug Tests`, ⁸`Engineers On Duty`
# Checking the classes and converting isInjured as factor:
df_rail_filtered$isInjured <- as.factor(df_rail_filtered$isInjured)
class(df_rail_filtered$isInjured)
## [1] "factor"
class(df_rail_filtered$`Train Speed`)
## [1] "numeric"
# Creating test and train partitions:
set.seed(1)
train <- createDataPartition(df_rail_filtered$isInjured, p=0.6, list=FALSE)
table(df_rail_filtered$isInjured[train])
##
## Injured No Injury
## 2453 33451
df_train <- df_rail_filtered[as.integer(train),]
df_test <- df_rail_filtered[-as.integer(train),]
# Down sampling:
undersampled_data <- downSample(df_train, y = df_train$isInjured)
# Fitting the model:
fit <- glm(isInjured ~ `Train Speed`, data=undersampled_data, family=binomial(link="logit"))
summary(fit)
##
## Call:
## glm(formula = isInjured ~ `Train Speed`, family = binomial(link = "logit"),
## data = undersampled_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3730 -1.2144 0.3183 1.0660 2.0480
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.448941 0.040447 11.10 <2e-16 ***
## `Train Speed` -0.030190 0.001977 -15.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6801.2 on 4905 degrees of freedom
## Residual deviance: 6536.5 on 4904 degrees of freedom
## AIC: 6540.5
##
## Number of Fisher Scoring iterations: 4
# Calculating the predicted probabilities and plotting histogram:
prob <- predict(fit, newdata=df_test, type="response")
pred <- ifelse(prob > 0.5, "pos", "neg")
hist(prob)
mean(pred == df_test$isInjured, na.rm=TRUE)
## [1] 0
table(pred, df_test$isInjured)
##
## pred Injured No Injury
## neg 747 5186
## pos 888 17114
# Functions for calculating sesnstivity and specificity:
sens <- function(c, p, ref, positive = levels(ref)[2])
{
mean((p > c)[ref == positive], na.rm=TRUE)
}
sens(0.5, prob, df_test$isInjured)
## [1] 0.7674439
spec <- function(c, p, ref, negative = levels(ref)[1])
{
mean((p < c)[ref == negative], na.rm=TRUE)
}
spec(0.5, prob, df_test$isInjured)
## [1] 0.4568807
# Plotting the ROC curve:
roc <- tibble(p=seq(from=0, to=1, by=0.01)) %>%
mutate(sensitivity = map_dbl(p, sens, p=prob, ref=df_test$isInjured),
specificity = map_dbl(p, spec, p=prob, ref=df_test$isInjured),
TPR=sensitivity,
FPR=1 - specificity)
roc
## # A tibble: 101 × 5
## p sensitivity specificity TPR FPR
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 0 1 1
## 2 0.01 1 0 1 1
## 3 0.02 1 0 1 1
## 4 0.03 1 0 1 1
## 5 0.04 1 0 1 1
## 6 0.05 1 0 1 1
## 7 0.06 1.00 0 1.00 1
## 8 0.07 1.00 0.000612 1.00 0.999
## 9 0.08 1.00 0.000612 1.00 0.999
## 10 0.09 1.00 0.000612 1.00 0.999
## # … with 91 more rows
ggplot(roc, aes(x=FPR, y=TPR)) +
geom_path(color="red", size=1) +
geom_vline(xintercept=0, color="green", linetype="dotdash") +
geom_hline(yintercept=1, color="green", linetype="dotdash") +
geom_abline(intercept=0, slope=1, color="blue", linetype="dotted") +
labs(x="False positive rate (1 - specificity)",
y="True positive rate (sensitivity)") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Calculating the AUC value:
sum(roc$TPR[-1] * abs(diff(roc$FPR))) # AUC
## [1] 0.5756423
# Fitting the model using a different function, including down sampling:
fit2 <- train(isInjured ~ `Train Speed`, data=df_train,
method="glm", family=binomial(link="logit"),
preProcess="medianImpute",
trControl=trainControl(method="none",
sampling="down"),
na.action=na.pass)
confusionMatrix(predict(fit2, df_test, na.action=na.pass),
df_test$isInjured)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Injured No Injury
## Injured 747 5186
## No Injury 888 17114
##
## Accuracy : 0.7462
## 95% CI : (0.7407, 0.7517)
## No Information Rate : 0.9317
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1011
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.45688
## Specificity : 0.76744
## Pos Pred Value : 0.12591
## Neg Pred Value : 0.95067
## Prevalence : 0.06831
## Detection Rate : 0.03121
## Detection Prevalence : 0.24788
## Balanced Accuracy : 0.61216
##
## 'Positive' Class : Injured
##
# Testing other models for comparison:
fit3 <- train(isInjured ~ `Temperature`, data=df_train,
method="glm", family=binomial(link="logit"),
preProcess="medianImpute",
trControl=trainControl(method="none",
sampling="down"),
na.action=na.pass)
confusionMatrix(predict(fit3, df_test, na.action=na.pass),
df_test$isInjured)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Injured No Injury
## Injured 854 11354
## No Injury 781 10946
##
## Accuracy : 0.493
## 95% CI : (0.4866, 0.4994)
## No Information Rate : 0.9317
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0033
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.52232
## Specificity : 0.49085
## Pos Pred Value : 0.06995
## Neg Pred Value : 0.93340
## Prevalence : 0.06831
## Detection Rate : 0.03568
## Detection Prevalence : 0.51005
## Balanced Accuracy : 0.50659
##
## 'Positive' Class : Injured
##